Add support for batched tasks. #668
Conversation
2c004b1 to
fffc3ec
Compare
fffc3ec to
9998554
Compare
88bbbd3 to
41fa201
Compare
e56075b to
3783e42
Compare
|
I might have reviewed a stale version (before the last commit). I like the approach and I think we can integrate that into TTG. The docs that were just added should be good to work this out. |
7dd4d27 to
b0a641e
Compare
| current_gpu_task = gpu_task; | ||
| do { | ||
| double *A, *B, *C; | ||
| int m, n, k, mb, nb, kb; | ||
| parsec_task_t *this_task = current_gpu_task->ec; | ||
| struct timeval start, end, diff; | ||
| double delta; | ||
| double *a_gpu, *b_gpu, *c_gpu; | ||
|
|
||
| parsec_dtd_unpack_args(this_task, | ||
| &A, &B, &C, | ||
| &m, &n, &k, | ||
| &mb, &nb, &kb); | ||
| (void)A; (void)B; (void)C; | ||
|
|
||
| a_gpu = parsec_dtd_get_dev_ptr(this_task, 0); | ||
| b_gpu = parsec_dtd_get_dev_ptr(this_task, 1); | ||
| c_gpu = parsec_dtd_get_dev_ptr(this_task, 2); | ||
|
|
||
| PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status, | ||
| { return PARSEC_HOOK_RETURN_ERROR; }); | ||
| gettimeofday(&start, NULL); | ||
|
|
||
| status = cublasDgemm_v2(handle, | ||
| CUBLAS_OP_N, CUBLAS_OP_N, | ||
| mb, nb, kb, | ||
| one_device, a_gpu, mb, | ||
| b_gpu, kb, | ||
| one_device, c_gpu, mb); | ||
| gettimeofday(&end, NULL); | ||
| timersub(&end, &start, &diff); | ||
| delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6; | ||
| if(verbose) { | ||
| fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s%s\n", | ||
| m, n, k, mb, kb, kb, nb, mb, kb, | ||
| this_task->taskpool->context->my_rank, | ||
| gpu_stream->name, delta, | ||
| batch_count > 1 ? " as part of a batch" : ""); | ||
| } | ||
|
|
||
| PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status, | ||
| { return PARSEC_HOOK_RETURN_ERROR; }); | ||
|
|
||
| current_gpu_task = (parsec_gpu_task_t *)current_gpu_task->list_item.list_next; | ||
| } while( current_gpu_task != gpu_task ); | ||
|
|
||
| if( verbose && batch_count > 1 ) { | ||
| fprintf(stderr, "Submitted %d batched GEMM tasks on GPU stream %s\n", | ||
| batch_count, gpu_stream->name); | ||
| } |
There was a problem hiding this comment.
I think having an example that actually batches GEMM calls would be helpful. Here is one (untested, written in the browser)
| current_gpu_task = gpu_task; | |
| do { | |
| double *A, *B, *C; | |
| int m, n, k, mb, nb, kb; | |
| parsec_task_t *this_task = current_gpu_task->ec; | |
| struct timeval start, end, diff; | |
| double delta; | |
| double *a_gpu, *b_gpu, *c_gpu; | |
| parsec_dtd_unpack_args(this_task, | |
| &A, &B, &C, | |
| &m, &n, &k, | |
| &mb, &nb, &kb); | |
| (void)A; (void)B; (void)C; | |
| a_gpu = parsec_dtd_get_dev_ptr(this_task, 0); | |
| b_gpu = parsec_dtd_get_dev_ptr(this_task, 1); | |
| c_gpu = parsec_dtd_get_dev_ptr(this_task, 2); | |
| PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status, | |
| { return PARSEC_HOOK_RETURN_ERROR; }); | |
| gettimeofday(&start, NULL); | |
| status = cublasDgemm_v2(handle, | |
| CUBLAS_OP_N, CUBLAS_OP_N, | |
| mb, nb, kb, | |
| one_device, a_gpu, mb, | |
| b_gpu, kb, | |
| one_device, c_gpu, mb); | |
| gettimeofday(&end, NULL); | |
| timersub(&end, &start, &diff); | |
| delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6; | |
| if(verbose) { | |
| fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s%s\n", | |
| m, n, k, mb, kb, kb, nb, mb, kb, | |
| this_task->taskpool->context->my_rank, | |
| gpu_stream->name, delta, | |
| batch_count > 1 ? " as part of a batch" : ""); | |
| } | |
| PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status, | |
| { return PARSEC_HOOK_RETURN_ERROR; }); | |
| current_gpu_task = (parsec_gpu_task_t *)current_gpu_task->list_item.list_next; | |
| } while( current_gpu_task != gpu_task ); | |
| if( verbose && batch_count > 1 ) { | |
| fprintf(stderr, "Submitted %d batched GEMM tasks on GPU stream %s\n", | |
| batch_count, gpu_stream->name); | |
| } | |
| current_gpu_task = gpu_task; | |
| // NOTE: assuming all tasks have the same shape. Special handling needed otherwise. | |
| int m, n, k, mb, nb, kb; | |
| double** As = malloc(batch_count*sizeof(*As)); | |
| double** Bs = malloc(batch_count*sizeof(*Bs)); | |
| double** Cs = malloc(batch_count*sizeof(*Cs)); | |
| for (int i = 0; i < batch_count; ++i) { | |
| double *A, *B, *C; | |
| parsec_task_t *this_task = current_gpu_task->ec; | |
| struct timeval start, end, diff; | |
| double delta; | |
| double *a_gpu, *b_gpu, *c_gpu; | |
| parsec_dtd_unpack_args(this_task, | |
| &A, &B, &C, | |
| &m, &n, &k, | |
| &mb, &nb, &kb); | |
| (void)A; (void)B; (void)C; | |
| As[i] = parsec_dtd_get_dev_ptr(this_task, 0); | |
| Bs[i] = parsec_dtd_get_dev_ptr(this_task, 1); | |
| Cs[i] = parsec_dtd_get_dev_ptr(this_task, 2); | |
| current_gpu_task = (parsec_gpu_task_t *)current_gpu_task->list_item.list_next; | |
| } while( current_gpu_task != gpu_task ); | |
| gettimeofday(&start, NULL); | |
| status = cublasDgemmBatched(handle, | |
| CUBLAS_OP_N, CUBLAS_OP_N, | |
| mb, nb, kb, | |
| one_device, As, mb, | |
| Bs, kb, | |
| one_device, Cs, mb, | |
| batch_count); | |
| gettimeofday(&end, NULL); | |
| timersub(&end, &start, &diff); | |
| delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6; | |
| if(verbose) { | |
| fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s%s\n", | |
| m, n, k, mb, kb, kb, nb, mb, kb, | |
| this_task->taskpool->context->my_rank, | |
| gpu_stream->name, delta, | |
| batch_count > 1 ? " as part of a batch" : ""); | |
| } | |
| PARSEC_CUDA_CHECK_ERROR("cublasDgemmBatched", status, | |
| { return PARSEC_HOOK_RETURN_ERROR; }); | |
| if( verbose && batch_count > 1 ) { | |
| fprintf(stderr, "Submitted %d batched GEMM tasks on GPU stream %s\n", | |
| batch_count, gpu_stream->name); | |
| } |
There was a problem hiding this comment.
I'm sorry, but it is not PaRSEC's role to batch GEMM tasks. There is a clear way to do it, and an example of how to do it. That should be more than enough for any PR!
That being said, this would be a really nice addition to DPLASMA ! Looking forward to your PR.
There was a problem hiding this comment.
What is the benefit of "batching" kernels one by one into the same stream vs submitting them the same way we do today? I don't see how the above will yield any benefits so it's not a good example.
There was a problem hiding this comment.
In the worst case we are saving one CUDA event recording and tracking per "batched" task — something you were very supportive of in the PR that saved one event for tasks "with all input arguments available". And here we might save more than one, so even in the worst case it is still a positive outcome.
There was a problem hiding this comment.
Not quite. Here we're saving one at most per task (the event on the execution stream) but still have the input and output ones for each task. In #681 we are potentially saving two per task (the input and the output events). Combined, this can significantly reduce the number of events overall.
There was a problem hiding this comment.
There is absolutely no requirement to have the input/output event for each task. This optimization is actually complementary to the one in #681.
| For PTG-generated tasks, use the `batch = true` body property on a device body: | ||
|
|
||
| ```c | ||
| BODY [type=CUDA | ||
| batch = true | ||
| dyld=cublasDgemm dyldtype=cublas_dgemm_t] | ||
| { | ||
| /* GPU submit body. */ | ||
| } |
There was a problem hiding this comment.
It's not clear how PTG tasks can actually batch kernel invocations. Simply stringing kernels together on the same stream won't save much.
There was a problem hiding this comment.
The same way as DTD tasks, and the same way we did two years ago for the GB submission.
There was a problem hiding this comment.
GB submissions are not part of the docs.
There was a problem hiding this comment.
This PR has been lingering here for a very long time. Let's get it in.
b0a641e to
6e5f669
Compare
The idea is the following: - task incarnations (aka BODY) can be marked with the "batch" property, allowing the runtime to provide the task with the entire list of ready tasks of the execution stream instead of just extracting the head. - this list of ready tasks is in fact a ring, which can then be trimmed by the kernel and divided into the batch and the rest. The rest of the tasks will be left in the ring, while the batch group will be submitted for execution. - the kernel also needs to provide a callback into the gpu_task complete_stage, such that the runtime can call the specialized function able to complete all batched tasks. Signed-off-by: George Bosilca <gbosilca@nvidia.com>
Replace the CUDA-specific batch build switch with PARSEC_HAVE_DEV_CAPABILITY_BATCH so batching is a runtime capability shared by all supported device types. Export the new option through parsec_options and PaRSECConfig. Add per-device MCA parameters to disable batching for CPU, recursive, CUDA, HIP, and Level Zero devices. Use shared helpers to sanitize batch chore types in DTD and to gate GPU task-ring batching on the selected device. Teach PTG to accept batch=true for CPU/default bodies as well as typed device bodies, and add CPU batch examples for both PTG and DTD with ctest coverage for the enabled and CPU-disabled DTD paths. Signed-off-by: George Bosilca <gbosilca@nvidia.com>
Add parsec_gpu_task_collect_batch() so GPU submit hooks can build a batched task ring by providing a compatibility callback, without directly manipulating the stream pending FIFO. Always pass a singleton task to the submit hook, and automatically chain a returned non-singleton task ring into the next GPU stream pending queue. This removes the need for user-provided completion callbacks whose only purpose was to merge the batched ring back into fifo_pending. Update the CUDA GEMM and stage_custom tests to use the new collector helper and drop their open-coded FIFO iteration and batched completion callbacks. Use PaRSEC's singleton terminology for the internal task-ring check. Signed-off-by: George Bosilca <gbosilca@nvidia.com>
Add Doxygen documentation for device task batching, including how to enable batching for PTG and DTD device chores. Describe the recommended parsec_gpu_task_collect_batch() interface, its callback contract, return values, and the runtime behavior for returned non-singleton task rings. Also document the lower-level direct FIFO/ring manipulation approach for expert users that need to avoid the callback overhead in high-load scenarios. Link the new page from the main Doxygen page and include it in the Doxygen input list. Signed-off-by: George Bosilca <gbosilca@nvidia.com>
6e5f669 to
a30718b
Compare
devreal
left a comment
There was a problem hiding this comment.
I guess we'll see benefits in DPLASMA once it's picked up there (not volunteering by any means...)
The idea is the following: